## Importing the Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv(r"C:\Users\lenovo\Desktop\adult.csv")
df.head()
| age | workclass | fnlwgt | education | education.num | marital.status | occupation | relationship | race | sex | capital.gain | capital.loss | hours.per.week | native.country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 90 | ? | 77053 | HS-grad | 9 | Widowed | ? | Not-in-family | White | Female | 0 | 4356 | 40 | United-States | <=50K |
| 1 | 82 | Private | 132870 | HS-grad | 9 | Widowed | Exec-managerial | Not-in-family | White | Female | 0 | 4356 | 18 | United-States | <=50K |
| 2 | 66 | ? | 186061 | Some-college | 10 | Widowed | ? | Unmarried | Black | Female | 0 | 4356 | 40 | United-States | <=50K |
| 3 | 54 | Private | 140359 | 7th-8th | 4 | Divorced | Machine-op-inspct | Unmarried | White | Female | 0 | 3900 | 40 | United-States | <=50K |
| 4 | 41 | Private | 264663 | Some-college | 10 | Separated | Prof-specialty | Own-child | White | Female | 0 | 3900 | 40 | United-States | <=50K |
df.shape
(32561, 15)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 32561 entries, 0 to 32560 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 32561 non-null int64 1 workclass 32561 non-null object 2 fnlwgt 32561 non-null int64 3 education 32561 non-null object 4 education.num 32561 non-null int64 5 marital.status 32561 non-null object 6 occupation 32561 non-null object 7 relationship 32561 non-null object 8 race 32561 non-null object 9 sex 32561 non-null object 10 capital.gain 32561 non-null int64 11 capital.loss 32561 non-null int64 12 hours.per.week 32561 non-null int64 13 native.country 32561 non-null object 14 income 32561 non-null object dtypes: int64(6), object(9) memory usage: 3.7+ MB
## Checking for Null Values in DataSet
df.isnull().sum()
age 0 workclass 0 fnlwgt 0 education 0 education.num 0 marital.status 0 occupation 0 relationship 0 race 0 sex 0 capital.gain 0 capital.loss 0 hours.per.week 0 native.country 0 income 0 dtype: int64
## Checking for unique values
df.nunique()
age 73 workclass 9 fnlwgt 21648 education 16 education.num 16 marital.status 7 occupation 15 relationship 6 race 5 sex 2 capital.gain 119 capital.loss 92 hours.per.week 94 native.country 42 income 2 dtype: int64
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| age | 32561.0 | 38.581647 | 13.640433 | 17.0 | 28.0 | 37.0 | 48.0 | 90.0 |
| fnlwgt | 32561.0 | 189778.366512 | 105549.977697 | 12285.0 | 117827.0 | 178356.0 | 237051.0 | 1484705.0 |
| education.num | 32561.0 | 10.080679 | 2.572720 | 1.0 | 9.0 | 10.0 | 12.0 | 16.0 |
| capital.gain | 32561.0 | 1077.648844 | 7385.292085 | 0.0 | 0.0 | 0.0 | 0.0 | 99999.0 |
| capital.loss | 32561.0 | 87.303830 | 402.960219 | 0.0 | 0.0 | 0.0 | 0.0 | 4356.0 |
| hours.per.week | 32561.0 | 40.437456 | 12.347429 | 1.0 | 40.0 | 40.0 | 45.0 | 99.0 |
df['workclass'].value_counts()
Private 22696 Self-emp-not-inc 2541 Local-gov 2093 ? 1836 State-gov 1298 Self-emp-inc 1116 Federal-gov 960 Without-pay 14 Never-worked 7 Name: workclass, dtype: int64
df['occupation'].value_counts()
Prof-specialty 4140 Craft-repair 4099 Exec-managerial 4066 Adm-clerical 3770 Sales 3650 Other-service 3295 Machine-op-inspct 2002 ? 1843 Transport-moving 1597 Handlers-cleaners 1370 Farming-fishing 994 Tech-support 928 Protective-serv 649 Priv-house-serv 149 Armed-Forces 9 Name: occupation, dtype: int64
df['native.country'].value_counts()
United-States 29170 Mexico 643 ? 583 Philippines 198 Germany 137 Canada 121 Puerto-Rico 114 El-Salvador 106 India 100 Cuba 95 England 90 Jamaica 81 South 80 China 75 Italy 73 Dominican-Republic 70 Vietnam 67 Guatemala 64 Japan 62 Poland 60 Columbia 59 Taiwan 51 Haiti 44 Iran 43 Portugal 37 Nicaragua 34 Peru 31 Greece 29 France 29 Ecuador 28 Ireland 24 Hong 20 Cambodia 19 Trinadad&Tobago 19 Laos 18 Thailand 18 Yugoslavia 16 Outlying-US(Guam-USVI-etc) 14 Hungary 13 Honduras 13 Scotland 12 Holand-Netherlands 1 Name: native.country, dtype: int64
df['marital.status'].value_counts()
Married-civ-spouse 14976 Never-married 10683 Divorced 4443 Separated 1025 Widowed 993 Married-spouse-absent 418 Married-AF-spouse 23 Name: marital.status, dtype: int64
df['sex'].value_counts()
Male 21790 Female 10771 Name: sex, dtype: int64
df['race'].value_counts()
White 27816 Black 3124 Asian-Pac-Islander 1039 Amer-Indian-Eskimo 311 Other 271 Name: race, dtype: int64
df['income'].value_counts()
<=50K 24720 >50K 7841 Name: income, dtype: int64
df['education'].value_counts()
HS-grad 10501 Some-college 7291 Bachelors 5355 Masters 1723 Assoc-voc 1382 11th 1175 Assoc-acdm 1067 10th 933 7th-8th 646 Prof-school 576 9th 514 12th 433 Doctorate 413 5th-6th 333 1st-4th 168 Preschool 51 Name: education, dtype: int64
sns.countplot(df['income'], palette='coolwarm', hue='relationship', data=df);
# Filling ? values
# replacing the values with mode
df['workclass'] = df['workclass'].replace('?', 'Private')
df['occupation'] = df['occupation'].replace('?', 'Prof-specialty')
df['native.country'] = df['native.country'].replace('?', 'United-States')
df.head()
| age | workclass | fnlwgt | education | education.num | marital.status | occupation | relationship | race | sex | capital.gain | capital.loss | hours.per.week | native.country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 90 | Private | 77053 | HS-grad | 9 | Widowed | Prof-specialty | Not-in-family | White | Female | 0 | 4356 | 40 | United-States | <=50K |
| 1 | 82 | Private | 132870 | HS-grad | 9 | Widowed | Exec-managerial | Not-in-family | White | Female | 0 | 4356 | 18 | United-States | <=50K |
| 2 | 66 | Private | 186061 | Some-college | 10 | Widowed | Prof-specialty | Unmarried | Black | Female | 0 | 4356 | 40 | United-States | <=50K |
| 3 | 54 | Private | 140359 | 7th-8th | 4 | Divorced | Machine-op-inspct | Unmarried | White | Female | 0 | 3900 | 40 | United-States | <=50K |
| 4 | 41 | Private | 264663 | Some-college | 10 | Separated | Prof-specialty | Own-child | White | Female | 0 | 3900 | 40 | United-States | <=50K |
# education Category
df.education= df.education.replace(['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th','10th', '11th', '12th'], 'school')
df.education = df.education.replace('HS-grad', 'high school')
df.education = df.education.replace(['Assoc-voc', 'Assoc-acdm', 'Prof-school', 'Some-college'], 'higher')
df.education = df.education.replace('Bachelors', 'undergrad')
df.education = df.education.replace('Masters', 'grad')
df.education = df.education.replace('Doctorate', 'doc')
# martial status
df['marital.status']= df['marital.status'].replace(['Married-civ-spouse', 'Married-AF-spouse'], 'married')
df['marital.status']= df['marital.status'].replace(['Never-married'], 'not-married')
df['marital.status']= df['marital.status'].replace(['Divorced', 'Separated','Widowed',
'Married-spouse-absent'], 'other')
# income
df.income = df.income.replace('<=50K', 0)
df.income = df.income.replace('>50K', 1)
df.head()
| age | workclass | fnlwgt | education | education.num | marital.status | occupation | relationship | race | sex | capital.gain | capital.loss | hours.per.week | native.country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 90 | Private | 77053 | high school | 9 | other | Prof-specialty | Not-in-family | White | Female | 0 | 4356 | 40 | United-States | 0 |
| 1 | 82 | Private | 132870 | high school | 9 | other | Exec-managerial | Not-in-family | White | Female | 0 | 4356 | 18 | United-States | 0 |
| 2 | 66 | Private | 186061 | higher | 10 | other | Prof-specialty | Unmarried | Black | Female | 0 | 4356 | 40 | United-States | 0 |
| 3 | 54 | Private | 140359 | school | 4 | other | Machine-op-inspct | Unmarried | White | Female | 0 | 3900 | 40 | United-States | 0 |
| 4 | 41 | Private | 264663 | higher | 10 | other | Prof-specialty | Own-child | White | Female | 0 | 3900 | 40 | United-States | 0 |
df['marital.status'].value_counts()
married 14999 not-married 10683 other 6879 Name: marital.status, dtype: int64
df['education'].value_counts()
high school 10501 higher 10316 undergrad 5355 school 4253 grad 1723 doc 413 Name: education, dtype: int64
df.corr()
| age | fnlwgt | education.num | capital.gain | capital.loss | hours.per.week | income | |
|---|---|---|---|---|---|---|---|
| age | 1.000000 | -0.076646 | 0.036527 | 0.077674 | 0.057775 | 0.068756 | 0.234037 |
| fnlwgt | -0.076646 | 1.000000 | -0.043195 | 0.000432 | -0.010252 | -0.018768 | -0.009463 |
| education.num | 0.036527 | -0.043195 | 1.000000 | 0.122630 | 0.079923 | 0.148123 | 0.335154 |
| capital.gain | 0.077674 | 0.000432 | 0.122630 | 1.000000 | -0.031615 | 0.078409 | 0.223329 |
| capital.loss | 0.057775 | -0.010252 | 0.079923 | -0.031615 | 1.000000 | 0.054256 | 0.150526 |
| hours.per.week | 0.068756 | -0.018768 | 0.148123 | 0.078409 | 0.054256 | 1.000000 | 0.229689 |
| income | 0.234037 | -0.009463 | 0.335154 | 0.223329 | 0.150526 | 0.229689 | 1.000000 |
sns.heatmap(df.corr(), annot=True);
df.hist(figsize=(12,12), layout=(3,3), sharex=False);
df.plot(kind='box', figsize=(12,12), layout=(3,3), sharex=False, subplots=True);
px.pie(df, values='education.num', names='education', title='% of edu',
color_discrete_sequence = px.colors.qualitative.T10)
sns.countplot(df['education'], hue='sex', data=df, palette='seismic');
X= df.drop(['income'], axis=1)
y = df['income']
from sklearn.preprocessing import StandardScaler, LabelEncoder
df1= df.copy()
df1= df1.apply(LabelEncoder().fit_transform)
df1.head()
| age | workclass | fnlwgt | education | education.num | marital.status | occupation | relationship | race | sex | capital.gain | capital.loss | hours.per.week | native.country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 72 | 3 | 2649 | 2 | 8 | 2 | 9 | 1 | 4 | 0 | 0 | 91 | 39 | 38 | 0 |
| 1 | 65 | 3 | 6514 | 2 | 8 | 2 | 3 | 1 | 4 | 0 | 0 | 91 | 17 | 38 | 0 |
| 2 | 49 | 3 | 11175 | 3 | 9 | 2 | 9 | 4 | 2 | 0 | 0 | 91 | 39 | 38 | 0 |
| 3 | 37 | 3 | 7009 | 4 | 3 | 2 | 6 | 4 | 4 | 0 | 0 | 90 | 39 | 38 | 0 |
| 4 | 24 | 3 | 16850 | 3 | 9 | 2 | 9 | 3 | 4 | 0 | 0 | 90 | 39 | 38 | 0 |
ss= StandardScaler().fit(df1.drop('income', axis=1))
X= ss.transform(df1.drop('income', axis=1))
y= df['income']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
lr = LogisticRegression()
model = lr.fit(X_train, y_train)
prediction = model.predict(X_test)
print("Acc on training data: {:,.3f}".format(lr.score(X_train, y_train)))
print("Acc on test data: {:,.3f}".format(lr.score(X_test, y_test)))
Acc on training data: 0.839 Acc on test data: 0.836
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
model1 = rfc.fit(X_train, y_train)
prediction1 = model1.predict(X_test)
print("Acc on training data: {:,.3f}".format(rfc.score(X_train, y_train)))
print("Acc on test data: {:,.3f}".format(rfc.score(X_test, y_test)))
Acc on training data: 1.000 Acc on test data: 0.854
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
print(confusion_matrix(y_test, prediction1))
[[6883 523] [ 903 1460]]
print(classification_report(y_test, prediction1))
precision recall f1-score support
0 0.88 0.93 0.91 7406
1 0.74 0.62 0.67 2363
accuracy 0.85 9769
macro avg 0.81 0.77 0.79 9769
weighted avg 0.85 0.85 0.85 9769
#Precision: tp/tp+fp
print('Precision =' , 10332/(10332+1286))
Precision = 0.8893096918574626
# recall= tp/tp+fn
print('Recall =', 10332/(10332+806))
Recall = 0.9276351230023343
print('Precision = ', 2229/(2229+806))
Precision = 0.7344316309719934
print('Recall= ', 2229/(2229+1286))
Recall= 0.6341394025604552
- - - - - - - - X X X X X X X X - - - - - - - -